# Scala library built from the *.scala sources in this directory.
# No explicit `name` is given; the scalding_job targets in this file depend on
# it as ":mbcg", so it presumably takes the default (directory-derived) name --
# confirm before moving or renaming this directory.
scala_library(
    sources = [
        "*.scala",
    ],
    platform = "java8",
    tags = [
        "bazel-compatible",
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [
        "3rdparty/jvm/com/twitter/algebird:core",
        "3rdparty/jvm/com/twitter/algebird:util",
        "3rdparty/jvm/com/twitter/storehaus:algebra",
        "3rdparty/jvm/com/twitter/storehaus:core",
        "3rdparty/src/jvm/com/twitter/scalding:args",
        "3rdparty/src/jvm/com/twitter/scalding:commons",
        "3rdparty/src/jvm/com/twitter/scalding:core",
        "3rdparty/src/jvm/com/twitter/scalding:date",
        "3rdparty/src/jvm/com/twitter/scalding:db",
        "3rdparty/src/jvm/com/twitter/scalding:parquet",
        "ann/src/main/scala/com/twitter/ann/hnsw",
        "ann/src/main/scala/com/twitter/ann/scalding/offline",
        "ann/src/main/scala/com/twitter/ann/util",
        "geoduck/hadoop/scalding/datasets:userlocation-scala",
        "iesource/common/src/main/scala/com/twitter/iesource/common/util",
        # Kept as a single literal (not a "+" concatenation) so the full label
        # stays searchable and visible to BUILD-file tooling.
        "iesource/processing/events/src/main/scala/com/twitter/iesource/processing/events/batch:server_engagements-scala",
        "iesource/thrift",
        "src/java/com/twitter/ml/api/constant",
        "src/scala/com/twitter/ml/api/util",
        "src/scala/com/twitter/ml/featurestore/catalog/entities/core",
        "src/scala/com/twitter/ml/featurestore/catalog/features/geo",
        "src/scala/com/twitter/ml/featurestore/lib/batch",
        "src/scala/com/twitter/scalding_internal/dalv2",
        "src/scala/com/twitter/scalding_internal/dalv2/dataset",
        "src/scala/com/twitter/scalding_internal/db",
        "src/scala/com/twitter/scalding_internal/db/jdbc",
        "src/scala/com/twitter/scalding_internal/error_handling",
        "src/scala/com/twitter/scalding_internal/job",
        "src/scala/com/twitter/scalding_internal/job/analytics_batch",
        "src/scala/com/twitter/scalding_internal/multiformat",
        "src/scala/com/twitter/scalding_internal/source",
        "src/scala/com/twitter/scalding_internal/source/lzo_scrooge",
        "src/scala/com/twitter/scalding_internal/typed",
        "src/scala/com/twitter/simclusters_v2/hdfs_sources",
        "src/scala/com/twitter/simclusters_v2/scalding/common",
        "src/thrift/com/twitter/ml/api:data-java",
        "src/thrift/com/twitter/ml/api:interpretable-model-java",
        "tweetsource/public_tweets/src/main/scala/com/twitter/tweetsource/public_tweets:public_tweets-scala",
        "twml/runtime/src/main/scala/com/twitter/twml/runtime/scalding",
        "util/util-core:scala",
        "util/util-stats/src/main/scala",
    ],
)

# Ad hoc (no cron) job that launches TweetEmbeddingGenerationAdhocJob with a
# fixed one-hour --dateRange and hard-coded model / output paths.
scalding_job(
    name = "tweet-embedding-generation-adhoc-job",
    main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationAdhocJob",
    args = [
        "--dateRange 2021-10-30T00 2021-10-30T01",
        "--model_name model",
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_model_1104/1635973177/tweet_tower_with_signature",
        "--concurrency_level 60",
        "--embedding_dimension 128",
        # NOTE(review): expected_elements / max_M / ef_construction look like
        # HNSW index-build parameters (the library depends on ann/hnsw) --
        # confirm against the job source.
        "--expected_elements 30000000",
        "--max_M 20",
        "--ef_construction 200",
        "--tweet_embedding_name output",
        "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/test_11_04_adhoc",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("submitter.tier", "preemptible"),
        ("hadoop.map.jvm.total-memory", "6144m"),
    ],
    # Same contact value as the other scalding_job targets in this file that
    # declare one; this target previously omitted the attribute.
    contact = "no-reply@twitter.com",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [":mbcg"],
)

# Scheduled job that launches TweetEmbeddingGenerationBatchJob. No --dateRange
# is passed; how the job picks its input window is not visible in this file.
scalding_job(
    name = "tweet-embedding-generation-batch-job",
    main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationBatchJob",
    args = [
        "--model_name model",
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0119_1day_0110_3l_5e_f2v_gpu_resave/tweet_tower_with_signature",
        "--concurrency_level 60",
        "--embedding_dimension 128",
        # NOTE(review): expected_elements / max_M / ef_construction look like
        # HNSW index-build parameters (the library depends on ann/hnsw) --
        # confirm against the job source.
        "--expected_elements 5000000",
        "--max_M 40",
        "--ef_construction 800",
        "--tweet_embedding_name output",
        "--f2v_input.feature_store_embedding Follow2VecProducerEmbedding200Dataset",
        "--f2v_input.feature_store_major_version 20210708",
        "--minFavCount 32",
        "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/0125_batch_index_f2v_minfav",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("hadoop.map.jvm.total-memory", "6144m"),
        ("hadoop.submitter.disk", "100g"),
    ],
    # Same contact value as the other scheduled scalding_job targets in this
    # file; this target previously omitted the attribute.
    contact = "no-reply@twitter.com",
    # Every five minutes in standard cron syntax.
    cron = "*/5 * * * *",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [":mbcg"],
)

# Scheduled job that launches TweetEmbeddingGenerationBatchJobAlternate.
# Unlike the other tweet batch targets in this file it passes --indexAllTweets
# and a --minFavCount of 100.
scalding_job(
    name = "tweet-embedding-generation-batch-job-alternate",
    main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationBatchJobAlternate",
    args = [
        "--model_name model",
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_331_329_1e_128em_b128_hn10_all_gpu/tweet_tower_with_signature",
        "--concurrency_level 60",
        "--embedding_dimension 128",
        # NOTE(review): expected_elements / max_M / ef_construction look like
        # HNSW index-build parameters (the library depends on ann/hnsw) --
        # confirm against the job source.
        "--expected_elements 5000000",
        "--max_M 40",
        "--ef_construction 800",
        "--tweet_embedding_name output",
        "--f2v_input.feature_store_embedding Follow2VecProducerEmbedding200Dataset",
        "--f2v_input.feature_store_major_version 20210708",
        "--minFavCount 100",
        # Bare flag with no value.
        "--indexAllTweets",
        "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/0401_batch_index_f2v_cosine_all_tweets",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("hadoop.map.jvm.total-memory", "6144m"),
        ("hadoop.submitter.disk", "100g"),
    ],
    contact = "no-reply@twitter.com",
    # Every five minutes in standard cron syntax.
    cron = "*/5 * * * *",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [":mbcg"],
)

# Scheduled job that launches TweetEmbeddingGenerationBatchJobExperimental
# with its own model path and ANN output path.
scalding_job(
    name = "tweet-embedding-generation-batch-job-experimental",
    main = "com.twitter.simclusters_v2.scalding.mbcg.TweetEmbeddingGenerationBatchJobExperimental",
    args = [
        "--model_name model",
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0127_1day_0110_3l_10e_128e_normf2v_nocosine_gpu/tweet_tower_with_signature",
        "--concurrency_level 60",
        "--embedding_dimension 128",
        # NOTE(review): expected_elements / max_M / ef_construction look like
        # HNSW index-build parameters (the library depends on ann/hnsw) --
        # confirm against the job source.
        "--expected_elements 5000000",
        "--max_M 40",
        "--ef_construction 800",
        "--tweet_embedding_name output",
        "--f2v_input.feature_store_embedding Follow2VecProducerEmbedding200Dataset",
        "--f2v_input.feature_store_major_version 20210708",
        "--minFavCount 32",
        "--ann_output_path hdfs:///atla/proc/user/cassowary/explore_mbcg/ann_index/0128_f2v_1week_batch_index",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("hadoop.map.jvm.total-memory", "6144m"),
        ("hadoop.submitter.disk", "100g"),
    ],
    contact = "no-reply@twitter.com",
    # Every five minutes in standard cron syntax.
    cron = "*/5 * * * *",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [":mbcg"],
)

# Ad hoc (no cron) job that launches UserEmbeddingGenerationAdhocJob with a
# fixed one-hour --dateRange and hard-coded model / output paths.
scalding_job(
    name = "user-embedding-generation-adhoc-job",
    main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationAdhocJob",
    args = [
        "--dateRange 2021-12-01T00 2021-12-01T01",
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_1202_logs_100m_b64_hn10_1127_video_persistent/user_tower_with_signature",
        "--embedding_dimension 128",
        "--user_embedding_name output",
        # Unlike --model_path, this path carries no hdfs:// scheme.
        "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/1207_adhoc_model_store",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("submitter.tier", "preemptible"),
        ("hadoop.map.jvm.total-memory", "6144m"),
    ],
    contact = "no-reply@twitter.com",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
        # Marks the target as known to fail; the tag references JIRA SD-20253.
        "known-to-fail-jira:SD-20253",
    ],
    dependencies = [":mbcg"],
)

# Scheduled job that launches UserEmbeddingGenerationBatchJob. No --dateRange
# is passed; how the job picks its input window is not visible in this file.
scalding_job(
    name = "user-embedding-generation-batch-job",
    main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationBatchJob",
    args = [
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0119_1day_0110_3l_5e_f2v_gpu_resave/user_tower_with_signature",
        "--embedding_dimension 128",
        "--user_embedding_name output",
        "--f2v_input.feature_store_embedding FollowBasedConsumerFollow2VecAvgEmbedding200Dataset",
        "--f2v_input.feature_store_major_version 20210708",
        # Unlike --model_path, this path carries no hdfs:// scheme.
        "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/0125_refreshed_model_store_f2v",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("submitter.tier", "preemptible"),
        ("hadoop.map.jvm.total-memory", "6144m"),
    ],
    contact = "no-reply@twitter.com",
    # Every thirty minutes in standard cron syntax.
    cron = "*/30 * * * *",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [":mbcg"],
)

# Scheduled job that launches UserEmbeddingGenerationBatchJobAlternate with
# its own model path and key-value store output path.
scalding_job(
    name = "user-embedding-generation-batch-job-alternate",
    main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationBatchJobAlternate",
    args = [
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_331_329_1e_128em_b128_hn10_all_gpu/user_tower_with_signature",
        "--embedding_dimension 128",
        "--user_embedding_name output",
        "--f2v_input.feature_store_embedding FollowBasedConsumerFollow2VecAvgEmbedding200Dataset",
        "--f2v_input.feature_store_major_version 20210708",
        # Unlike --model_path, this path carries no hdfs:// scheme.
        "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/0401_refreshed_model_store_all",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("submitter.tier", "preemptible"),
        ("hadoop.map.jvm.total-memory", "6144m"),
    ],
    contact = "no-reply@twitter.com",
    # Every thirty minutes in standard cron syntax.
    cron = "*/30 * * * *",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [":mbcg"],
)

# Scheduled job that launches UserEmbeddingGenerationBatchJobExperimental with
# its own model path and key-value store output path.
scalding_job(
    name = "user-embedding-generation-batch-job-experimental",
    main = "com.twitter.simclusters_v2.scalding.mbcg.UserEmbeddingGenerationBatchJobExperimental",
    args = [
        "--model_path hdfs:///atla/proc/user/cassowary/explore_mbcg/models/tfx_0127_1day_0110_3l_10e_128e_normf2v_nocosine_gpu/user_tower_with_signature",
        "--embedding_dimension 128",
        "--user_embedding_name output",
        "--f2v_input.feature_store_embedding FollowBasedConsumerFollow2VecAvgEmbedding200Dataset",
        "--f2v_input.feature_store_major_version 20210708",
        # NOTE(review): the model directory name says "nocosine" while this
        # output directory says "cosine_all_tweets" -- confirm the pairing is
        # intentional.
        "--kvs_output_path /user/cassowary/explore_mbcg/user_kvs_store/0328_f2v_cosine_all_tweets_model_store",
    ],
    config = [
        ("hadoop.submitter.cpu", 60),
        ("hadoop.submitter.jvm.total-memory", "256g"),
        ("submitter.tier", "preemptible"),
        ("hadoop.map.jvm.total-memory", "6144m"),
    ],
    contact = "no-reply@twitter.com",
    # Every thirty minutes in standard cron syntax.
    cron = "*/30 * * * *",
    hadoop_cluster = "atla-proc3",
    platform = "java8",
    role = "cassowary",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [":mbcg"],
)
